In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import scipy
import scipy.sparse as sp
import matplotlib.pyplot as plt
import pandas as pd
import re
import networkx as nx
import itertools
from pygsp import graphs, filters, plotting
from sklearn.cluster import KMeans
from sklearn import metrics
import pickle
import os
from collections import Counter
from sklearn import mixture
%load_ext autoreload
%autoreload 2
plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'
The dictionaries built in this notebook are:
In [2]:
# Global switch controlling which cleaned dataset the next cell loads (STI subset vs. full).
A_FEW_SECTIONS=True # True if you want to keep only a few sections, False to keep all sections
In [3]:
# Load the cleaned enrolment and course tables; `string` tags every output file below.
suffix = '_STI' if A_FEW_SECTIONS else ''
enrol2 = pd.read_pickle("../data/cleaned_enrol" + suffix + ".pickle")
courses = pd.read_pickle("../data/cleaned_courses" + suffix + ".pickle")
string = 'STI' if A_FEW_SECTIONS else 'with_AR'
# Number of years each course has existed (used to normalise the student graph weights).
years = pd.read_pickle(os.path.join("Graphs", "years.pkl"))
In [4]:
# Quick overview of what remains after cleaning.
Students = enrol2['PersonID'].unique()    # the distinct students
Courses = enrol2['CourseCodes'].unique()  # the distinct courses (reused throughout the notebook)
print('There are {} students in the dataset'.format(len(Students)))
print('There are {} different courses in the dataset'.format(len(Courses)))
enrol2['StudyPlanCode'].unique()  # sections that remain after cleaning (cell output)
Out[4]:
In [5]:
# Dictionary mapping each student to the list of courses he/she enrolled in.
students_courses_df = enrol2[['PersonID', 'CourseCodes']].set_index('PersonID')
students_courses_dico = students_courses_df.groupby(level=0).apply(lambda g: g.to_dict('list'))
# Dictionary mapping each course code to its row/column index in the weight
# matrices built below.
courses_index_dico = dict(zip(Courses, np.arange(len(Courses))))
In [6]:
# Weight matrix of the course graph built from shared students (one row/col per course).
weights_wt_students=np.zeros((len(Courses),len(Courses))) # weight matrix 1
In [7]:
w1 = 0.1  # weight contributed per shared student
# For every student, connect each pair of courses he/she took.  The contribution
# is divided by the age (in years) of the younger course of the pair, so that
# long-running courses do not dominate the graph.
for person in Students:
    taken = students_courses_dico[person]['CourseCodes']
    for c1, c2 in itertools.combinations(taken, 2):
        i, j = courses_index_dico[c1], courses_index_dico[c2]
        nb_years = min(years[c1], years[c2])
        weights_wt_students[i, j] += w1 / nb_years
        weights_wt_students[j, i] += w1 / nb_years
Check the matrix is symmetric:
In [8]:
# Should output two empty arrays: W - W.T has no non-zero entry iff W is symmetric.
np.nonzero(weights_wt_students-weights_wt_students.T)
Out[8]:
In [9]:
# Sparsity pattern of the student-based weight matrix.
plt.figure(1,figsize=(10,10))
plt.spy(weights_wt_students)
Out[9]:
In [10]:
# Build a networkx graph from the weight matrix (node i corresponds to Courses[i]).
# NOTE(review): from_numpy_matrix was removed in networkx >= 3.0 — this notebook
# relies on an older networkx; verify the pinned version.
G1=nx.from_numpy_matrix(weights_wt_students)
In [11]:
# Spring-layout drawing of the student-based course graph.
plt.figure(1,figsize=(16,16))
pos1 = nx.spring_layout(G1)
nx.draw_networkx_nodes(G1, pos1, cmap=plt.get_cmap('jet')) # plot the nodes
nx.draw_networkx_labels(G1, pos1) # plot the labels
nx.draw_networkx_edges(G1, pos1) # plot the edges
plt.show()
In [12]:
# NOTE(review): connected_component_subgraphs was removed in networkx >= 2.4;
# this call requires the 1.x/2.x API — verify the pinned networkx version.
len(list(nx.connected_component_subgraphs(G1))) # number of connected components
Out[12]:
In [13]:
# nx.degree(G) returns a {node: degree} dict in the networkx 1.x API.
plt.hist(list(nx.degree(G1).values())) # degree distribution
Out[13]:
In [14]:
# Find the hub (highest-degree node) of the student-based graph.
# Taking the max over the degree dict itself returns the node id directly and
# does not assume that dict.values() iterates in node order, which the previous
# np.argmax(list(...values())) implicitly relied on.
deg1 = nx.degree(G1)  # {node: degree} dict (networkx 1.x API)
hub_student = max(deg1, key=deg1.get)
print('The hub is:')
print(Courses[hub_student])
print(courses[courses.index.str.endswith(Courses[hub_student])].CourseTitleFR.tolist()[0])
In [15]:
# Save the weight matrix in a .pkl file; the `with` statement guarantees the
# file handle is closed even if pickle.dump raises.
with open(os.path.join(os.getcwd(), "Graphs", "students_graph_" + str(string) + ".pkl"), "wb") as pkl_file:
    pickle.dump(weights_wt_students, pkl_file)
In [16]:
# Restrict the course table to courses that actually appear in the enrolment data.
courses2=courses[courses.index.isin(Courses)]
In [17]:
# Identify the study plans: normalise the separators of the ';'-separated
# StudyPlans field, split it into columns, then collect the unique plan names.
dump = courses2['StudyPlans'].str.replace(' -', '').str.replace(' ;', ';').str.replace('; ', ';').str.strip()
dump = dump.str.split(';', expand=True)
StudyPlans = []
for i in range(dump.shape[1]):  # iterate over all split columns (was hard-coded to 9)
    StudyPlans += dump[i].unique().tolist()
# De-duplicate, drop empty/None entries and sort for a deterministic order.
StudyPlans = sorted(set(filter(None, StudyPlans)))
# Remove plans beginning with ED, Ho, Au or UN.
StudyPlans = [elem for elem in StudyPlans if elem[:2] not in ('ED', 'Ho', 'Au', 'UN')]
print(len(StudyPlans))
In [18]:
# Build the dictionary mapping each study plan to its list of course codes.
section_courses_dico = {}
for plan in StudyPlans:
    # regex=False: plan names are literal strings, not regular expressions —
    # a plan containing a regex metacharacter would otherwise mis-match or raise.
    section_courses_dico[plan] = courses2[courses2['StudyPlans'].str.contains(plan, regex=False)].index.tolist()
In [19]:
# Weight matrix of the course graph built from shared study plans.
weights_wt_section=np.zeros((len(Courses),len(Courses))) # weight matrix 2
NB: "Courses" must be the list of currently existing courses (defined earlier from the enrolment data).
In [20]:
w2 = 0.1  # weight contributed per shared study plan
# Connect every pair of courses that appear together in the same study plan.
for plan in StudyPlans:
    for c1, c2 in itertools.combinations(section_courses_dico[plan], 2):
        i, j = courses_index_dico[c1], courses_index_dico[c2]
        weights_wt_section[i, j] += w2
        weights_wt_section[j, i] += w2
Check the matrix is symmetric:
In [21]:
# Should output two empty arrays: the matrix is symmetric iff W - W.T is all zeros.
np.nonzero(weights_wt_section-weights_wt_section.T)
Out[21]:
In [22]:
# Sparsity pattern of the section-based weight matrix.
plt.figure(1,figsize=(10,10))
plt.spy(weights_wt_section)
Out[22]:
In [23]:
# Build a networkx graph from the section-based weight matrix (node i == Courses[i]).
G2=nx.from_numpy_matrix(weights_wt_section)
In [24]:
# Spring-layout drawing of the section-based course graph.
plt.figure(1,figsize=(16,16))
pos2 = nx.spring_layout(G2)
nx.draw_networkx_nodes(G2, pos2, cmap=plt.get_cmap('jet')) # plot the nodes
nx.draw_networkx_labels(G2, pos2) # plot the labels
nx.draw_networkx_edges(G2, pos2) # plot the edges
plt.show()
In [25]:
# NOTE(review): connected_component_subgraphs requires networkx < 2.4.
len(list(nx.connected_component_subgraphs(G2))) # number of connected components
Out[25]:
In [26]:
# nx.degree(G) returns a {node: degree} dict in the networkx 1.x API.
plt.hist(list(nx.degree(G2).values())) # degree distribution
Out[26]:
In [27]:
# Find the hub (highest-degree node) of the section-based graph.
# max over the degree dict returns the node id directly, without assuming that
# dict.values() iterates in node order (the previous np.argmax relied on that).
deg2 = nx.degree(G2)  # {node: degree} dict (networkx 1.x API)
hub_section = max(deg2, key=deg2.get)
print('The hub is:')
print(Courses[hub_section])
print(courses[courses.index.str.endswith(Courses[hub_section])].CourseTitleFR.tolist()[0])
In [28]:
# Save the weight matrix in a .pkl file; the `with` statement guarantees the
# file handle is closed even if pickle.dump raises.
with open(os.path.join(os.getcwd(), "Graphs", "section_graph_" + str(string) + ".pkl"), "wb") as pkl_file:
    pickle.dump(weights_wt_section, pkl_file)
In [29]:
# Keep only courses that have at least one professor SCIPER attached.
# Boolean masking is safer than the previous np.where + drop-by-label pattern,
# which would drop every row sharing a label if the index contained duplicates.
courses3 = courses2[courses2['ProfessorSCIPERs'].notnull()]
Prof = courses3['ProfessorSCIPERs'].unique()
CoursesCodes = courses3.index.unique()
print('There are {} professors in the dataset'.format(len(Prof)))
print('There are {} different courses in the dataset'.format(len(CoursesCodes)))
In [30]:
# Each ProfessorSCIPERs field is a ';'-separated list; flatten to the set of
# distinct professor SCIPERs.
Prof_scipers = list(set(itertools.chain.from_iterable(p.split(';') for p in Prof)))
print('There are {} different professors in the dataset'.format(len(Prof_scipers)))
In [31]:
# Map each professor SCIPER to the list of courses he/she teaches.
# Testing membership in the split list avoids the substring false positives of
# str.contains (e.g. SCIPER '12345' would also match inside '123456').
prof_courses_dico = {}
_prof_lists = courses3['ProfessorSCIPERs'].str.split(';')
for prof_sci in Prof_scipers:
    mask = _prof_lists.apply(lambda scipers, s=prof_sci: s in scipers)
    prof_courses_dico[prof_sci] = courses3[mask].index.tolist()
In [32]:
# Weight matrix of the course graph built from shared professors.
weights_wrt_prof=np.zeros((len(Courses),len(Courses))) # weight matrix 3
In [33]:
w3 = 0.1  # weight contributed per shared professor
# Connect every pair of courses taught by the same professor.
for prof_sci in Prof_scipers:
    for c1, c2 in itertools.combinations(prof_courses_dico[prof_sci], 2):
        i, j = courses_index_dico[c1], courses_index_dico[c2]
        weights_wrt_prof[i, j] += w3
        weights_wrt_prof[j, i] += w3
In [34]:
# Check the matrix is symmetric: should output two empty arrays.
np.nonzero(weights_wrt_prof-weights_wrt_prof.T)
Out[34]:
In [35]:
# Sparsity pattern of the professor-based weight matrix.
plt.figure(1,figsize=(10,10))
plt.spy(weights_wrt_prof)
Out[35]:
In [36]:
# Build a networkx graph from the professor-based weight matrix (node i == Courses[i]).
G3=nx.from_numpy_matrix(weights_wrt_prof)
In [37]:
# Spring-layout drawing of the professor-based course graph.
plt.figure(1,figsize=(16,16))
pos3 = nx.spring_layout(G3)
nx.draw_networkx_nodes(G3, pos3, cmap=plt.get_cmap('jet')) # plot the nodes
nx.draw_networkx_labels(G3, pos3) # plot the labels
nx.draw_networkx_edges(G3, pos3) # plot the edges
plt.show()
In [38]:
# NOTE(review): connected_component_subgraphs requires networkx < 2.4.
len(list(nx.connected_component_subgraphs(G3))) # number of connected components
Out[38]:
In [39]:
# Largest (giant) connected component of the professor-based graph (networkx 1.x API).
Gcc=sorted(nx.connected_component_subgraphs(G3), key = len, reverse=True)[0]
print(len(Gcc)) # size of the giant component
print(list(Gcc)) # list of the nodes of the giant components
In [40]:
print(Courses[Gcc]) # course codes of the giant components
print([courses2[courses2.index.str.endswith(Courses[Gcc][p])].CourseTitleFR.tolist() for p in range(len(Gcc))])
In [41]:
# Display the node sets of all connected components.
print(list(nx.connected_components(G3))) # display the different components
In [42]:
# Distribution of the sizes of the connected components.
# Compute the component list once instead of re-running nx.connected_components
# on every loop iteration (the previous version was accidentally quadratic).
component_sizes_G3 = [len(c) for c in nx.connected_components(G3)]
plt.hist(component_sizes_G3)
Out[42]:
In [43]:
# nx.degree(G) returns a {node: degree} dict in the networkx 1.x API.
plt.hist(list(nx.degree(G3).values())) # degree distribution
Out[43]:
In [44]:
# Find the hub (highest-degree node) of the professor-based graph and list its
# neighbors.  max over the degree dict returns the node id directly, without
# assuming dict.values() iterates in node order (previous np.argmax relied on it).
deg3 = nx.degree(G3)  # {node: degree} dict (networkx 1.x API)
hub_prof = max(deg3, key=deg3.get)
print('The hub is:')
print(Courses[hub_prof])
print(courses[courses.index.str.endswith(Courses[hub_prof])].CourseTitleFR.tolist()[0])
print('The neighbors of this hub are:')
# Neighbors = courses sharing a professor with the hub (positive weight in its row).
ind_hub_prof = np.where(weights_wrt_prof[hub_prof, :] > 0)[0]
print(Courses[ind_hub_prof])
print([courses[courses.index.str.endswith(Courses[ind_hub_prof][p])].CourseTitleFR.tolist() for p in range(len(ind_hub_prof))])
In [45]:
# Save the weight matrix in a .pkl file; the `with` statement guarantees the
# file handle is closed even if pickle.dump raises.
with open(os.path.join(os.getcwd(), "Graphs", "prof_graph_" + str(string) + ".pkl"), "wb") as pkl_file:
    pickle.dump(weights_wrt_prof, pkl_file)
In [46]:
# Keep only courses that have at least one assistant SCIPER attached.
# Boolean masking is safer than the previous np.where + drop-by-label pattern,
# which would drop every row sharing a label if the index contained duplicates.
courses4 = courses2[courses2['AssistantSCIPERs'].notnull()]
Assistants = courses4['AssistantSCIPERs'].unique()
CoursesCodes = courses4.index.unique()
print('There are {} assistants in the dataset'.format(len(Assistants)))
print('There are {} different courses in the dataset'.format(len(CoursesCodes)))
In [47]:
# Each AssistantSCIPERs field is a ';'-separated list; flatten to the set of
# distinct assistant SCIPERs.
Assistant_scipers = list(set(itertools.chain.from_iterable(a.split(';') for a in Assistants)))
print('There are {} different assistants in the dataset'.format(len(Assistant_scipers)))
In [48]:
# Map each assistant SCIPER to the list of courses he/she assists.
# Testing membership in the split list avoids the substring false positives of
# str.contains (e.g. SCIPER '12345' would also match inside '123456').
assistants_courses_dico = {}
_ass_lists = courses4['AssistantSCIPERs'].str.split(';')
for ass_sci in Assistant_scipers:
    mask = _ass_lists.apply(lambda scipers, s=ass_sci: s in scipers)
    assistants_courses_dico[ass_sci] = courses4[mask].index.tolist()
In [49]:
# Weight matrix of the course graph built from shared assistants.
weights_wrt_assistants=np.zeros((len(Courses),len(Courses))) # weight matrix 4
In [50]:
w4 = 0.1  # weight contributed per shared assistant
# Connect every pair of courses assisted by the same assistant.
for ass_sci in Assistant_scipers:
    for c1, c2 in itertools.combinations(assistants_courses_dico[ass_sci], 2):
        i, j = courses_index_dico[c1], courses_index_dico[c2]
        weights_wrt_assistants[i, j] += w4
        weights_wrt_assistants[j, i] += w4
In [51]:
# Check the matrix is symmetric: should output two empty arrays.
np.nonzero(weights_wrt_assistants-weights_wrt_assistants.T)
Out[51]:
In [52]:
# Sparsity pattern of the assistant-based weight matrix.
plt.figure(1,figsize=(10,10))
plt.spy(weights_wrt_assistants)
Out[52]:
In [53]:
# Build a networkx graph from the assistant-based weight matrix (node i == Courses[i]).
G4=nx.from_numpy_matrix(weights_wrt_assistants)
In [54]:
# Spring-layout drawing of the assistant-based course graph.
plt.figure(1,figsize=(16,16))
pos4 = nx.spring_layout(G4)
nx.draw_networkx_nodes(G4, pos4, cmap=plt.get_cmap('jet')) # plot the nodes
nx.draw_networkx_labels(G4, pos4) # plot the labels
nx.draw_networkx_edges(G4, pos4) # plot the edges
plt.show()
In [55]:
# NOTE(review): connected_component_subgraphs requires networkx < 2.4.
len(list(nx.connected_component_subgraphs(G4))) # number of connected components
Out[55]:
In [56]:
# Largest (giant) connected component of the assistant-based graph (networkx 1.x API).
# NOTE: this reuses/overwrites the name `Gcc` from the professor-graph analysis above.
Gcc=sorted(nx.connected_component_subgraphs(G4), key = len, reverse=True)[0]
print(len(Gcc)) # size of the giant component
list(Gcc) # nodes of the giant component
Out[56]:
In [57]:
# Distribution of the sizes of the connected components.
# Compute the component list once instead of re-running nx.connected_components
# on every loop iteration (the previous version was accidentally quadratic).
component_sizes_G4 = [len(c) for c in nx.connected_components(G4)]
plt.hist(component_sizes_G4)
Out[57]:
In [58]:
# nx.degree(G) returns a {node: degree} dict in the networkx 1.x API.
plt.hist(list(nx.degree(G4).values())) # degree distribution
Out[58]:
In [59]:
# Find the hub (highest-degree node) of the assistant-based graph and list its
# neighbors.  max over the degree dict returns the node id directly, without
# assuming dict.values() iterates in node order (previous np.argmax relied on it).
deg4 = nx.degree(G4)  # {node: degree} dict (networkx 1.x API)
hub_assistant = max(deg4, key=deg4.get)
print('The hub is:')
print(Courses[hub_assistant])
print(courses[courses.index.str.endswith(Courses[hub_assistant])].CourseTitleFR.tolist()[0])
print('The neighbors of this hub are:')
# Neighbors = courses sharing an assistant with the hub (positive weight in its row).
ind_hub_assistants = np.where(weights_wrt_assistants[hub_assistant, :] > 0)[0]
print(Courses[ind_hub_assistants])
print([courses[courses.index.str.endswith(Courses[ind_hub_assistants][p])].CourseTitleFR.tolist() for p in range(len(ind_hub_assistants))])
In [60]:
# Save the weight matrix in a .pkl file; the `with` statement guarantees the
# file handle is closed even if pickle.dump raises.
with open(os.path.join(os.getcwd(), "Graphs", "assistants_graph_" + str(string) + ".pkl"), "wb") as pkl_file:
    pickle.dump(weights_wrt_assistants, pkl_file)